Portfolio

Author

Hans Capener

Libraries
library(mosaic)
library(tidyverse)
library(plotly)
library(reticulate)

ggplot2

Data manipulation + boxplot code
# Load the food survey and keep only GPA plus both parents' education codes.
food <- read_csv("data/food.csv")

food1 <- select(food, GPA, father_education, mother_education)

# Collapse the two parental codes into one household label. The label is
# driven by the higher of the two codes; the two "both" labels apply only when
# both parents share the lowest (1) or the highest (5) code. Rows whose codes
# fall outside 1-5 (or are missing) get no label and are dropped by na.omit(),
# as are rows whose GPA cannot be parsed as a number.
food2 <- food1 %>%
  mutate(
    parent_education = {
      codes_ok <- mother_education %in% 1:5 & father_education %in% 1:5
      highest <- pmax(mother_education, father_education)
      case_when(
        !codes_ok ~ NA_character_,
        highest == 1 ~ "BHSD",
        highest == 2 ~ "1HS",
        highest == 3 ~ "1SC",
        highest == 4 ~ "1BD",
        mother_education == 5 & father_education == 5 ~ "BGD",
        highest == 5 ~ "1GD"
      )
    },
    # Non-numeric GPA entries become NA here (with a coercion warning).
    GPA = as.numeric(GPA)
  ) %>%
  na.omit()

# Order the household labels from least to most parental education so that
# plots using this factor show the groups in that sequence.
food2 <- mutate(
  food2,
  parent_education = fct_relevel(
    parent_education,
    "BHSD", "1HS", "1SC", "1BD", "1GD", "BGD"
  )
)

# GPA distribution per parental-education group, with the raw observations
# overlaid as jittered points.
ggplot(food2, aes(x = parent_education, y = GPA)) +
  geom_boxplot(
    # Map fill to the group instead of passing a fixed six-colour vector:
    # a constant vector errors as soon as one group is absent from the data.
    aes(fill = parent_education),
    # Every observation is already drawn by geom_jitter(); hiding the boxplot's
    # own outlier markers avoids plotting those points twice.
    outlier.shape = NA,
    show.legend = FALSE
  ) +
  scale_fill_manual(values = c(
    "BHSD" = "cyan3",
    "1HS" = "deepskyblue",
    "1SC" = "deepskyblue1",
    "1BD" = "deepskyblue2",
    "1GD" = "deepskyblue3",
    "BGD" = "deepskyblue4"
  )) +
  # height = 0 keeps each point at its true GPA; only the x position is spread.
  geom_jitter(color = "black", size = 1, alpha = 0.9, width = 0.25, height = 0) +
  labs(
    title = "How Do Parents' Education Levels Affect College Students' GPAs?",
    x = "Parents' Education Levels"
  ) +
  theme_light()

  • Both high school dropouts (BHSD)

  • At least one high school graduate (1HS)

  • At least one did some college (1SC)

  • At least one bachelor's degree (1BD)

  • At least one graduate degree (1GD)

  • Both graduate degrees (BGD)

To see the full analysis this chart was used in: College GPA vs Parental Education

Visualization Recreation

Mimicking a visualization from Our World in Data

Original Graph

My Recreation:

Code
# Read the fruit/GDP export and replace its headers with short names.
# The names are assigned by position, so the file must keep this column order.
fruit <- read_csv("data/fruit.csv")
names(fruit) <- c("Entity", "Code", "Year", "Fruit", "GDP", "Continent")

# Drop rows without a code, then spread each entity's continent label across
# all of its rows (filled downwards first, then upwards).
continent <- fruit %>%
  filter(!is.na(Code)) %>%
  group_by(Entity) %>%
  fill(Continent, .direction = "downup") %>%
  ungroup()

library(ggrepel)

# Snapshot of the single year shown in the chart.
year <- filter(continent, Year == 2020)

# Subset of six named entities from that snapshot.
# NOTE(review): not referenced by the plotting code visible in this file;
# presumably intended for ggrepel labels - confirm.
country5 <- filter(
  year,
  Entity %in% c(
    "Dominica",
    "Dominican Republic",
    "Guyana",
    "Albania",
    "Papua New Guinea",
    "Ghana"
  )
)

# Scatter of fruit supply against GDP per capita for the 2020 snapshot,
# coloured by continent, on a logarithmic x axis.
ggplot(year, aes(x = GDP, y = Fruit, color = Continent)) +
  # Gray open circle drawn under each coloured point as an outline.
  geom_point(size = 2, shape = 1, color = "gray70") +
  geom_point(alpha = 0.85) +
  # scale_x_log10() positions points and breaks exactly as the natural-log
  # transform did (log bases differ only by a constant factor) and avoids the
  # `trans` argument, which newer ggplot2 releases deprecate.
  scale_x_log10(
    breaks = c(1000, 2000, 5000, 10000, 20000, 50000, 100000),
    labels = c("$1,000", "$2,000", "$5,000", "$10,000",
               "$20,000", "$50,000", "$100,000")
  ) +
  # Values outside 0-400 kg are dropped by the limits.
  scale_y_continuous(
    limits = c(0, 400),
    expand = c(0, 0),
    breaks = seq(0, 350, 50),
    labels = c("0 kg", "50 kg", "100 kg", "150 kg",
               "200 kg", "250 kg", "300 kg", "350 kg")
  ) +
  scale_color_manual(values = c(
    "Africa" = "#9B559D",
    "Asia" = "#32847E",
    "Europe" = "#536A9D",
    "North America" = "#D96C58",
    "Oceania" = "#925026",
    "South America" = "#802F39"
  )) +
  # Legend keys are drawn as solid, fully opaque squares.
  guides(color = guide_legend(
    override.aes = list(shape = 15, alpha = 1, size = 3),
    keyheight = 0.9,
    keywidth = 0
  )) +
  labs(
    title = "Fruit consumption vs. GDP per capita, 2020",
    subtitle = "Average per capita fruit consumption, measured in kilograms per year versus\ngross domestic product (GDP) per capita, measured in constant international-$",
    x = "GDP per capita",
    y = "Fruit supply per person"
  ) +
  theme_classic() +
  theme(
    panel.grid.major = element_line(linetype = "dotted", color = "gray70"),
    axis.line.y = element_blank(),
    # `linewidth` replaces the deprecated `size` argument for line elements
    # (requires ggplot2 >= 3.4.0).
    axis.line.x = element_line(linewidth = 0.25),
    axis.ticks.y = element_blank(),
    axis.ticks.x = element_blank(),
    legend.title = element_blank(),
    legend.justification = c(1, 1),
    axis.title.x = element_text(vjust = -1),
    axis.title.y = element_text(vjust = 4)
  )

plotly - Interactive Graphs

plotly code
# Interactive scatter of housing priced under $1000 for rows with Gender "F":
# distance to campus against capacity, with price shown by colour and size.
Rent <- read_csv("data/rent.csv")

wRent <- Rent %>%
  filter(Gender == "F", Price < 1000) %>%
  # Round the distance so values read cleanly in the hover label.
  mutate(MilesToCampus = round(MilesToCampus, 2))

plot_ly(
  wRent,
  # Stating the trace type and mode explicitly avoids plotly's
  # "No trace type specified" / "No scatter mode specified" messages.
  type = "scatter",
  mode = "markers",
  x = ~MilesToCampus,
  y = ~Capacity,
  color = ~Price,
  colors = c("hotpink", "hotpink4"),
  size = ~Price,
  # paste0() keeps the hover text tight: name on one line, "$<price>" below,
  # without the separator spaces paste() would insert.
  text = ~paste0(Apartment, "\n$", Price)
) %>%
  layout(
    title = "Women's BYU-I Approved Housing\nUnder $1000 per Semester",
    xaxis = list(title = "Miles to the Center of Campus"),
    yaxis = list(title = "Maximum Housing Capacity")
  )

To see the full analysis this chart was used in: Housing Analysis for Stephanie

Libraries
import pandas as pd
import numpy as np
import altair as alt

from IPython.display import Markdown
from IPython.display import display
from tabulate import tabulate

Pandas, Numpy, Altair

Code
# Yearly name counts published by the byuidatascience project.
names = pd.read_csv("https://github.com/byuidatascience/data4names/raw/master/data-raw/names_year/names_year.csv")

# Keep only the rows for the four names being compared.
christian_names = names[names["name"].isin(['Mary', 'Martha', 'Peter', 'Paul'])]

# One series per name: year on x (formatted as a plain integer), Total on y.
christian_chart = (
    alt.Chart(
        christian_names,
        title=alt.Title(
            "People Born Each Year",
            subtitle="with the names 'Martha', 'Mary', 'Paul', and 'Peter'",
        ),
    )
    .encode(
        x=alt.X('year').title("Year").axis(format="d"),
        y=alt.Y('Total'),
        color='name',
    )
)

# The mark is chosen at display time; the result of this expression is the
# rendered line chart.
christian_chart.mark_line()
Code
# Load the flight-delay records, clean the known bad values, and build one
# summary row per airport with a combined "delay rating".
flights = pd.read_json("https://github.com/byuidatascience/data4missing/raw/master/data-raw/flights_missing/flights_missing.json")

# Strip every non-digit character so only the digits remain.
# The values are still strings after this step.
flights['num_of_delays_carrier'] = (
    flights['num_of_delays_carrier'].str.replace(r'\D', '', regex=True)
)
# Replace blank strings, -999, and "n/a" with real NaN values, and fix the
# misspelled month name.
flights = (flights
    .replace(["", -999, "n/a"], np.nan)
    .replace(["Febuary"], "February")
)
# Fill NaN values in num_of_delays_late_aircraft with the column mean.
# The result is assigned back: calling fillna(inplace=True) on an
# attribute-accessed column is chained assignment, which warns in pandas 2.x
# and does not update the frame under Copy-on-Write.
mean_late_air = flights["num_of_delays_late_aircraft"].mean()
flights["num_of_delays_late_aircraft"] = (
    flights["num_of_delays_late_aircraft"].fillna(mean_late_air)
)
# Fill a missing month with the month of the preceding row (same reasoning
# for assigning back instead of using inplace=True).
flights["month"] = flights["month"].ffill()

totals = (flights
    .groupby("airport_code")
    # The string "sum" replaces np.sum, which groupby.agg deprecates as a
    # callable; the computed totals are the same.
    .agg(
        total_minutes_delayed=("minutes_delayed_total", "sum"),
        total_delays=("num_of_delays_total", "sum"),
        total_flights=("num_of_flights_total", "sum"),
    )
    .assign(
        total_hrs_delayed=lambda df: df.total_minutes_delayed / 60,
        # Average length of a delay, in hours.
        ave_hrs_delayed=lambda df: df.total_hrs_delayed / df.total_delays,
        # Share of all flights that were delayed.
        proportion_delayed=lambda df: df.total_delays / df.total_flights,
        # Rating = share of flights delayed x average delay length.
        delay_rating=lambda df: df.proportion_delayed * df.ave_hrs_delayed,
    )
    .sort_values('delay_rating', ascending=False)
    .reset_index()
)

# Horizontal bars of delay rating per airport, ordered by descending rating.
# Colour repeats the airport code, so the legend is suppressed.
best_airport = (
    alt.Chart(
        totals,
        title=alt.Title(
            "Airports Rated by Delay",
            subtitle="The higher the rating, the worse airport",
        ),
    )
    .mark_bar()
    .encode(
        x=alt.X('delay_rating:Q').title("Delay Rating"),
        y=alt.Y('airport_code:N').title("Airport Code").sort("-x"),
        color=alt.Color('airport_code:N').legend(None).scale(scheme="tealblues"),
    )
)
best_airport